import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fast')
sns.set_style('darkgrid')
# Load the loan-prediction training set.  The location below is an absolute
# path on the author's machine.
train_csv_path = r'C:\Users\amaresh.murthiraju\Documents\Course Material\ML\06 Decision Tree\Loan Prediction use case\LoanPred_train.csv'
Loan_train = pd.read_csv(train_csv_path)

# First look at the data: leading rows, per-column dtype / non-null summary,
# and descriptive statistics.
Loan_train.head(5)
Loan_train.info()
Loan_train.describe()

# Number of missing values in every column.
Loan_train.isna().sum()

# Pairwise relationship plots across the data frame's columns.
sns.pairplot(data=Loan_train)
# Numeric columns inspected in this section (non-null counts noted by the
# author from the info() output):
#   ApplicantIncome    614 non-null  int64
#   CoapplicantIncome  614 non-null  float64
#   LoanAmount         592 non-null  float64

# Applicant / co-applicant income: distribution plots on the top row, box
# plots on the bottom row.
plt.figure(figsize=(20, 10))
plt.subplot(221)
sns.distplot(Loan_train.ApplicantIncome)
plt.subplot(222)
sns.distplot(Loan_train.CoapplicantIncome)
plt.subplot(223)
# The series is passed as `x=` explicitly: newer seaborn releases no longer
# interpret the first positional argument as `x`.
sns.boxplot(x=Loan_train.ApplicantIncome)
plt.subplot(224)
sns.boxplot(x=Loan_train.CoapplicantIncome)
plt.show();

# Loan amount: distribution of the non-missing values next to a box plot.
plt.figure(figsize=(20, 5))
plt.subplot(121)
# LoanAmount still contains missing values here, so drop them for this plot.
sns.distplot(Loan_train.LoanAmount.dropna())
plt.subplot(122)
sns.boxplot(x=Loan_train.LoanAmount)
plt.show();

# Mean loan amount (used below to fill the missing entries).
Loan_train.LoanAmount.mean()
# Categorical columns of interest: Gender, Married, Dependents, Education,
# Self_Employed.
Loan_train['Dependents'].unique()

# Loan amount broken down by four of the categorical columns, one box plot
# per cell of a 2x2 grid.
plt.figure(figsize=(20, 15))
plt.subplot(2, 2, 1)
sns.boxplot(x='Gender', y='LoanAmount', data=Loan_train)
plt.subplot(2, 2, 2)
sns.boxplot(x='Married', y='LoanAmount', data=Loan_train);
plt.subplot(2, 2, 3)
sns.boxplot(x='Education', y='LoanAmount', data=Loan_train);
plt.subplot(2, 2, 4)
sns.boxplot(x='Self_Employed', y='LoanAmount', data=Loan_train);
plt.show();
# Fill the missing loan amounts with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: in-place mutation of a
# column selection is deprecated in recent pandas and does not update the
# frame when copy-on-write is enabled.
Loan_train['LoanAmount'] = Loan_train['LoanAmount'].fillna(Loan_train['LoanAmount'].mean())

# Check the loan amount distribution after filling the missing values.
plt.figure(dpi=100)
sns.distplot(Loan_train.LoanAmount);
# Identifying outliers
# 1. Based on IQR
#    i.  Values less than Q1 - 1.5*IQR
#    ii. Values greater than Q3 + 1.5*IQR
# 2. Values beyond mean +/- 2SD

# --- 1. IQR fences ---------------------------------------------------------
np.percentile(Loan_train.LoanAmount, [25, 75])
Q1, Q3 = np.percentile(Loan_train.LoanAmount, [25, 75])
Q1
Q3
IQR = Q3 - Q1
IQR
LL = Q1 - 1.5 * IQR  # lower fence
UL = Q3 + 1.5 * IQR  # upper fence
LL
UL

# --- 2. mean +/- 2SD -------------------------------------------------------
loan_amount_mean = Loan_train.LoanAmount.mean()
loan_amount_std = Loan_train.LoanAmount.std()
loan_amount_std
loan_amount_mean + 2 * loan_amount_std
loan_amount_mean - 2 * loan_amount_std

# Rows outside the IQR fences.  The computed LL / UL are used rather than
# literal thresholds so the filter stays consistent with the data.
Loan_train[(Loan_train.LoanAmount < LL) | (Loan_train.LoanAmount > UL)]
# Loan amount: compare the raw distribution (left) with its natural log
# (right), then keep the log-transformed values as a new column.
_loan_fig, (_loan_raw_ax, _loan_log_ax) = plt.subplots(1, 2, figsize=(20, 7))
sns.distplot(Loan_train['LoanAmount'], ax=_loan_raw_ax)
sns.distplot(np.log(Loan_train['LoanAmount']), ax=_loan_log_ax)
plt.show();
Loan_train['LoanAmount_log'] = np.log(Loan_train['LoanAmount'])

# Applicant and co-applicant income are combined into a single total, which
# gets the same raw-vs-log comparison and a log-transformed column.
Loan_train['TotalIncome'] = Loan_train['ApplicantIncome'] + Loan_train['CoapplicantIncome']
_income_fig, (_income_raw_ax, _income_log_ax) = plt.subplots(1, 2, figsize=(20, 7))
sns.distplot(Loan_train['TotalIncome'], ax=_income_raw_ax)
sns.distplot(np.log(Loan_train['TotalIncome']), ax=_income_log_ax)
plt.show();
Loan_train['TotalIncome_log'] = np.log(Loan_train['TotalIncome'])
# Data frame after the numerical variable analysis.
Loan_train.head(n=10)
Loan_train.dtypes.index
Loan_train.dtypes.values

# Frequency of each value in the categorical / discrete columns.
Loan_train['Gender'].value_counts()
Loan_train['Married'].value_counts()
Loan_train['Dependents'].value_counts()
Loan_train['Education'].value_counts()
Loan_train['Self_Employed'].value_counts()
Loan_train['Credit_History'].value_counts()
Loan_train['Property_Area'].value_counts()
Loan_train['Loan_Status'].value_counts()
Loan_train['Dependents'].value_counts().plot(kind='bar');

# Education vs. Gender: the contingency table, then grouped and stacked bars.
edu_by_gender = pd.crosstab(Loan_train['Education'], Loan_train['Gender'])
edu_by_gender
edu_by_gender.plot(kind='bar');
edu_by_gender.plot(kind='bar', stacked=True, grid=False);

# Loan status within every (Education, Gender) combination.
status_by_edu_gender = pd.crosstab(
    [Loan_train['Education'], Loan_train['Gender']],
    Loan_train['Loan_Status'],
)
status_by_edu_gender
status_by_edu_gender.plot(kind='bar');
status_by_edu_gender.plot(kind='bar', stacked=True);
# Handle missing values
Loan_train.isna().sum()

# Columns whose gaps are filled with the most frequent value (mode).
missing_cat_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']

# Example for one column: the value counts and the resulting mode.
Loan_train.Gender.value_counts()
Loan_train.Gender.mode()[0]

for col_name in missing_cat_cols:
    # Assign back instead of `fillna(..., inplace=True)` on the selected
    # column: the in-place form is deprecated in recent pandas and does not
    # update the frame when copy-on-write is enabled.
    Loan_train[col_name] = Loan_train[col_name].fillna(Loan_train[col_name].mode()[0])

# Re-check the missing-value counts after imputation.
Loan_train.isna().sum()
Loan_train.head(10)

# Snapshot of the imputed, not-yet-encoded frame.
Loan_train_bak = Loan_train.copy()
# Encode the categorical columns as integer labels.
encode_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so each column's label <-> integer
# mapping stays available (e.g. for `inverse_transform`).  Re-fitting a single
# shared encoder would retain only the mapping of the last column.
label_encoders = {}
for col in encode_cols:
    le = LabelEncoder()
    Loan_train[col] = le.fit_transform(Loan_train[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last entry of encode_cols.

# Data frame after the categorical analysis.
Loan_train.head(10)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Loan_train.columns

# Predictors and target used by the baseline models.
feature_cols = ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'Loan_Amount_Term', 'Credit_History',
                'Property_Area', 'LoanAmount_log', 'TotalIncome_log']
X = Loan_train[feature_cols]
y = Loan_train.Loan_Status

# Model 0: fitted and scored on the same rows, so this accuracy measures fit
# to the training data only.
# `random_state` is fixed so repeated runs build the same tree; the split
# below is already seeded the same way.
DT_model = DecisionTreeClassifier(random_state=0)
DT_model.fit(X, y)
pred = DT_model.predict(X)
accuracy_score(y, pred)

# Model 1: hold out 15% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
X_train.shape
X_test.shape
DT_model1 = DecisionTreeClassifier(random_state=0)
DT_model1.fit(X_train, y_train)

# Accuracy on the training portion ...
pred_X_train = DT_model1.predict(X_train)
accuracy_score(y_train, pred_X_train)

# ... and on the held-out portion.
pred_X_test = DT_model1.predict(X_test)
accuracy_score(y_test, pred_X_test)
# RFE: recursive feature elimination with a decision tree as the estimator.
from sklearn.feature_selection import RFE

feature_cols = ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'Loan_Amount_Term', 'Credit_History',
                'Property_Area', 'LoanAmount_log', 'TotalIncome_log']
X = Loan_train[feature_cols]
y = Loan_train.Loan_Status

# Seeded so the reported ranking is the same on every run.
model = DecisionTreeClassifier(random_state=0)
# `n_features_to_select` is passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, y)
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)

# Names of the retained features (those with ranking 1, i.e. support_ True).
for selected_name in X.columns[fit.support_]:
    print(selected_name)

# Create a new DT model with selected features.
# NOTE(review): this list is written out by hand rather than derived from
# `fit.support_`; presumably it mirrors the RFE output of the author's run.
# Confirm it still matches the names printed above.
feature_cols = ['Dependents', 'Credit_History', 'LoanAmount_log', 'TotalIncome_log']
X = Loan_train[feature_cols]
y = Loan_train.Loan_Status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
X_train.shape
X_test.shape

DT_model2 = DecisionTreeClassifier(random_state=0)
DT_model2.fit(X_train, y_train)

# Accuracy on the training portion ...
pred_X_train = DT_model2.predict(X_train)
accuracy_score(y_train, pred_X_train)

# ... and on the held-out portion.
pred_X_test = DT_model2.predict(X_test)
accuracy_score(y_test, pred_X_test)
# NOTE: You need to install pydotplus and Graphviz for the tree plots below.
# These can be installed with your system package manager (Graphviz) and with
# pip (pydotplus). Graphviz is a tool for drawing graphs described in DOT
# files. Pydotplus is a Python interface to Graphviz's DOT language.
# `StringIO` comes from the standard library: the `sklearn.externals.six`
# shim it used to be imported from no longer exists in current scikit-learn.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
import os

# Show the current working directory.
os.getcwd()

# Create DOT data: with out_file=None the DOT source is returned as a string.
dot_data = export_graphviz(DT_model2, out_file=None,
                           feature_names=feature_cols)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())
# `StringIO` comes from the standard library: the `sklearn.externals.six`
# shim it used to be imported from no longer exists in current scikit-learn.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
import collections

# Create DOT data for a filled, rounded rendering with class labels.
dot_data = export_graphviz(DT_model2, out_file=None,
                           feature_names=feature_cols, filled=True,
                           rounded=True, class_names=['No', 'Yes'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Re-colour the nodes: for every parent, the child with the lower node id
# gets colors[0] and the other child gets colors[1].
colors = ('yellow', 'grey')
edges = collections.defaultdict(list)

# Group destination node ids by their source node.
for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for children in edges.values():
    children.sort()
    # zip() stops at the shorter sequence, so a parent with fewer than two
    # recorded children is coloured as far as possible instead of raising
    # IndexError.
    for fill_color, child_id in zip(colors, children):
        dest = graph.get_node(str(child_id))[0]
        dest.set_fillcolor(fill_color)

# Show graph
Image(graph.create_png())

# Save the re-coloured tree to the working directory.
graph.write_png('tree.png')
# Export the tree once more, this time writing the DOT source into an
# in-memory text buffer instead of taking it as a return value.
dot_data = StringIO()
export_options = {
    'feature_names': feature_cols,
    'class_names': ['No','Yes'],
    'filled': True,
    'rounded': True,
    'special_characters': True,
}
export_graphviz(DT_model2, out_file=dot_data, **export_options)

# Build the graph from the buffered DOT text, display it, and write it to
# 'tree.png' (the same file name used for the previous rendering).
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
graph.write_png('tree.png')